suppressWarnings(suppressPackageStartupMessages({
    library(dplyr)
    library(magrittr)
    library(data.table)
    library(ggplot2)
    library(tidyr)
    library(knitr)
    library(pander)
}))
opts_chunk$set(cache=TRUE, echo=TRUE, message=FALSE, warning=FALSE, dev="pdf")

Read in data

load("results/cellcount_coloc.rdata")
load("results/cellcount_mr.rdata")
mrivw$fdr <- p.adjust(mrivw$pval, "fdr")
wr$fdr <- p.adjust(wr$pval, "fdr")

Significant ivw

mrivw_sig <- subset(mrivw, fdr < 0.05)
nrow(mrivw_sig) / nrow(mrivw)
## [1] 0.01630987

Significant hits more likely when number of instruments is larger

table(mrivw_sig$nsnp) / table(mrivw$nsnp)
## 
##          2          3          4          5          6          7          8 
## 0.01584892 0.01602472 0.02361833 0.03318803 0.02875790 0.05765199 0.01176471 
##          9 
## 0.17647059
summary(glm(I(fdr < 0.05) ~ nsnp, mrivw, family="binomial")) %>% pander::pander()
  Estimate Std. Error z value Pr(>|z|)
(Intercept) -4.464 0.01933 -231 0
nsnp 0.157 0.007893 19.89 4.928e-88

(Dispersion parameter for binomial family taken to be 1 )

Null deviance: 370938 on 2226319 degrees of freedom
Residual deviance: 370572 on 2226318 degrees of freedom

Significant wr

wr_sig <- subset(wr, fdr < 0.05 & nsnp == 1)
table(wr_sig$method) %>% pander::pander()
Inverse variance weighted Wald ratio
0 574891
table(wr_sig$nsnp) %>% pander::pander()
1
574891
table(wr$nsnp) %>% pander::pander()
1 2 3
9567947 5188 35

Check that the WR and coloc analyses have the same data

res$id.exposure <- paste(res$variant, res$cpg)
rescheck <- subset(res, paste(id.exposure, trait) %in% paste(wr_sig$id.exposure, wr_sig$id.outcome))
dim(rescheck) %>% pander::pander()

574891 and 15

dim(wr_sig) %>% pander::pander()

574891 and 10

Coloc

table(rescheck$nsnps > 10) %>% pander::pander()
FALSE TRUE
60628 513203
table(rescheck$PP.H4.abf > 0.8 & rescheck$nsnps > 10) %>% pander::pander()
FALSE TRUE
479093 94738

Comparing colocalising associations against secondary mQTLs

Find colocalising hits

wr <- wr %>% tidyr::separate(exposure, sep=" ", into=c("variant", "cpg"), remove=FALSE)

# Get significant wr
wr_sig <- subset(wr, fdr < 0.05)

# Which colocalise?
wr_coloc <- inner_join(
    wr_sig,
    subset(res, PP.H4.abf > 0.8 & nsnps > 10),
    by=c("id.exposure", "outcome"="trait")
)

Which of these cpgs have a secondary mqtl?

cpg_trait <- unique(paste(wr_coloc$cpg.x, wr_coloc$outcome))
wr_cpg_trait <- subset(wr, paste(cpg, outcome) %in% cpg_trait)
wr_cpg_trait$cpgtrait <- paste(wr_cpg_trait$cpg, wr_cpg_trait$outcome)
table(table(wr_cpg_trait$cpgtrait)) %>% pander::pander()
1 2 3 4 5 6 7 8 9
42126 22067 14897 6975 3385 1416 354 68 15

Separate the primary and separate coloc signals

wr_cpg_trait_primary <- subset(wr_cpg_trait, paste(id.exposure, outcome) %in% paste(wr_coloc$id.exposure, wr_coloc$id.outcome))

wr_cpg_trait_secondary <- subset(wr_cpg_trait, !paste(id.exposure, outcome) %in% paste(wr_coloc$id.exposure, wr_coloc$id.outcome))

For every cpg-trait pair, meta analyse the results to get a single assoc. Essentially creating the IVW fixed effects estimate from the Wald ratios

cpg_trait_list <- unique(paste(wr_cpg_trait$cpg, wr_cpg_trait$outcome))
metafn <- function(x)
{
    m <- meta::metagen(x$b, x$se)
    y <- tibble(
        nsnp=nrow(x),
        b=m$TE.fixed,
        se=m$seTE.fixed,
        pval=m$pval.fixed
    )
    y
}
out <- wr_cpg_trait_secondary %>% 
    group_by(cpgtrait) %>%
    do(metafn(.))

Combine primary and secondary results

ps <- inner_join(wr_cpg_trait_primary, out, by="cpgtrait")
ps$fdr.y <- p.adjust(ps$pval.y, "fdr")
ps$fdr.x <- p.adjust(ps$pval.x, "fdr")
save(ps, file="results/primary-secondary.rdata")

Check all primary associations have fdr < 0.05

table(ps$fdr.x < 0.05) %>% pander::pander()
TRUE
50782

How many secondary associations have fdr < 0.05?

table(ps$fdr.y < 0.05) %>% pander::pander()
FALSE TRUE
35409 15373

We should remove chromosome 6 to be careful about MHC region

table(ps$fdr.y[!grepl("6:", ps$variant)] < 0.05) %>% pander::pander()
FALSE TRUE
23238 4900

Looks like a lot of this is from the MHC region.

Are the primary and secondary effects correlated?

summary(lm(b.y ~ b.x, ps)) %>% pander::pander()
  Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.001699 0.0003603 4.715 2.425e-06
b.x 0.08542 0.001436 59.47 0
Fitting linear model: b.y ~ b.x
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
50782 0.08119 0.06512 0.0651

This effect attenuates sharply after removing MHC

summary(lm(b.y ~ b.x, ps %>% subset(., !grepl("6:", variant)))) %>% pander::pander()
  Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.0006889 0.0003654 1.885 0.05942
b.x 0.03129 0.00202 15.49 6.276e-54
Fitting linear model: b.y ~ b.x
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
28138 0.06129 0.008461 0.008426

Look at the plots

ggplot(ps %>% subset(., !grepl("6:", variant)), aes(x = b.x, y = b.y)) +
geom_point(aes(colour=fdr.y < 0.05)) +
geom_smooth(method="lm") +
labs(x="Primary causal estimate", y="Secondary causal estimate", colour="Secondary\nFDR < 0.05")

A lot of significant results secondary effects, but not much of a relationship between primary and secondary hits.

How does it compare by splitting by number of instruments used for secondary association?

ggplot(ps %>% subset(., !grepl("6:", variant)), aes(x = b.x, y = b.y)) +
geom_point(aes(colour=fdr.y < 0.05)) +
geom_smooth(method="lm") +
facet_wrap(~ nsnp.y) +
labs(x="Primary causal estimate", y="Secondary causal estimate", colour="Secondary\nFDR < 0.05")

What are the primary-secondary associations?

assocfn <- function(x)
{
    mod <- summary(lm(b.y ~ b.x, x))$coefficients
    data.frame(nassoc=nrow(x), b=mod[2,1], se=mod[2,2], p=mod[2,4]) 
}

ps %>% 
    subset(., !grepl("6:", variant)) %>%
    group_by(nsnp=nsnp.y) %>%
    do(assocfn(.)) %>%
    pander()
nsnp nassoc b se p
1 15357 0.04754 0.003452 6.678e-43
2 8519 0.01797 0.00262 7.324e-12
3 3066 0.01787 0.003329 8.596e-08
4 917 -0.01366 0.005672 0.01622
5 270 0.02947 0.009731 0.002701
6 9 -0.02171 0.01542 0.2021

Group by blood cell trait category

load("bc_type/bc_type.rdata")
bcm$id <- gsub("ebi-a-", "", bcm$id)
stopifnot(all(ps$id.outcome %in% bcm$id))
ps <- inner_join(ps, subset(bcm, select=c(trait, id, type)), by=c("id.outcome"="id"))

ggplot(ps %>% subset(., !grepl("6:", variant) & fdr.y < 0.05), aes(x = b.x, y = b.y)) +
geom_point(aes(colour=type)) +
geom_smooth(method="lm", aes(color=type)) +
labs(x="Primary causal estimate", y="Secondary causal estimate", colour="Secondary\nFDR < 0.05") +
scale_colour_brewer(type="qual", palette=3)
ggplot(ps %>% subset(., !grepl("6:", variant)), aes(x = b.x, y = b.y)) +
geom_point(aes(colour=type)) +
geom_smooth(method="lm", aes(color=type)) +
labs(x="Primary causal estimate", y="Secondary causal estimate", colour="Secondary\nFDR < 0.05") +
scale_colour_brewer(type="qual", palette=3)

Coefficients per type

ps %>% 
    subset(., !grepl("6:", variant) & fdr.y < 0.05) %>%
    group_by(type=type) %>%
    do(assocfn(.)) %>%
    pander()
type nassoc b se p
Lymphoid 960 0.06633 0.0189 0.0004683
Myeloid 1702 0.1027 0.01347 4.143e-14
Platelet 865 0.07294 0.0195 0.0001955
Red blood cell 1373 0.1953 0.02217 3.656e-18
ps %>% 
    subset(., !grepl("6:", variant)) %>%
    group_by(type=type) %>%
    do(assocfn(.)) %>%
    pander()
type nassoc b se p
Lymphoid 6279 0.01929 0.003735 2.474e-07
Myeloid 10289 0.02607 0.002961 1.49e-18
Platelet 4181 0.0223 0.004679 1.947e-06
Red blood cell 7389 0.06347 0.00542 2.119e-31

Coefficients per study

ps %>% 
    subset(., !grepl("6:", variant)) %>%
    group_by(type=trait) %>%
    do(assocfn(.)) %>%
    arrange(desc(b)) %>%
    pander()
type nassoc b se p
Hematocrit 384 0.1075 0.0242 1.163e-05
Hemoglobin concentration 313 0.1001 0.02891 0.0006139
Immature fraction of reticulocytes 358 0.0973 0.02656 0.0002865
Mean corpuscular hemoglobin concentration 406 0.09513 0.0182 2.753e-07
High light scatter reticulocyte percentage of red cells 565 0.07705 0.0204 0.0001753
Reticulocyte count 504 0.07481 0.02162 0.0005843
High light scatter reticulocyte count 566 0.07246 0.02015 0.000351
Reticulocyte fraction of red cells 492 0.06241 0.02311 0.007165
Platelet distribution width 241 0.06047 0.02237 0.007356
Mean corpuscular volume 1532 0.05421 0.0127 2.082e-05
Red blood cell count 1017 0.05015 0.01213 3.835e-05
Mean corpuscular hemoglobin 1252 0.04777 0.01343 0.0003876
Eosinophil counts 847 0.04727 0.01032 5.365e-06
Sum basophil neutrophil counts 1439 0.04359 0.008549 3.873e-07
Eosinophil percentage of white cells 928 0.03747 0.009325 6.336e-05
Mean platelet volume 1191 0.03629 0.009348 0.0001092
Eosinophil percentage of granulocytes 952 0.03584 0.008809 5.117e-05
White blood cell count (basophil) 410 0.03391 0.01397 0.01567
Neutrophil percentage of granulocytes 904 0.03347 0.008491 8.712e-05
Basophil percentage of white cells 570 0.03256 0.0123 0.008353
Platelet count 1272 0.02486 0.008316 0.002851
Lymphocyte percentage of white cells 902 0.02416 0.008679 0.005495
Neutrophil percentage of white cells 864 0.02262 0.008719 0.009632
Monocyte count 847 0.02132 0.01162 0.06682
Myeloid white cell count 1119 0.02101 0.009583 0.02854
Basophil percentage of granulocytes 314 0.01761 0.01331 0.1867
Granulocyte count 898 0.01491 0.009825 0.1294
Sum neutrophil eosinophil counts 904 0.01447 0.01027 0.1593
Neutrophil count 848 0.0134 0.009944 0.1783
White blood cell count 1199 0.01332 0.01015 0.1895
Lymphocyte counts 759 0.01142 0.01119 0.308
Monocyte percentage of white cells 1057 0.01123 0.009493 0.2372
Plateletcrit 1477 0.01056 0.007575 0.1636
Granulocyte percentage of myeloid white cells 807 0.01023 0.01064 0.3367